Processing methylation data

Here we will use an existing methylation dataset - 450k in cord blood (15 samples)
To read in your own dataset (usually “idat files” ending in .idat)
See the help for ?read.metharray.exp

Load packages that we will use focusing on minfi:
see Aryee et al. Bioinformatics 2014.
Other popular options for processing & analyzing methylation data include RnBeads and methylumi

suppressMessages(library(minfi)) # popular package for methylation data
library(FlowSorted.CordBlood.450k) # example dataset
library(pryr) # for monitoring memory use
library(ENmix) # probe type adjustment "rcp"
## Loading required package: doParallel
suppressMessages(require(sva)) # for addressing batch effects

bring a 450k dataset in to memory (from eponymous BioC package above)
see Bakulski et al. Epigenetics 2016.

data(FlowSorted.CordBlood.450k)

because it is flow sorted - the authors give us the cell types
here we show the frequencies:

table(pData(FlowSorted.CordBlood.450k)$CellType)
## 
##      Bcell       CD4T       CD8T       Gran       Mono         NK 
##         15         15         14         12         15         14 
##       nRBC WholeBlood 
##          4         15
# subset to just the Whole Blood samples since this is the most common for epi studies
WB <- FlowSorted.CordBlood.450k[, pData(FlowSorted.CordBlood.450k)$CellType == "WholeBlood"]
ncol(WB)
## Samples 
##      15

we need to change the sampleName attribute here just because we are using a reference and these samples are otherwise in the reference dataset when we want to estimate their composition.

# your personal samples won't need to be renamed
sampleNames(WB) <- 1:15

Look at the attributes of this dataset
It is stored as a RGChannelSet which means it is not yet processed (red and green signals stored separately)

WB
## RGChannelSet (storageMode: lockedEnvironment)
## assayData: 622399 features, 15 samples 
##   element names: Green, Red 
## An object of class 'AnnotatedDataFrame'
##   sampleNames: 1 2 ... 15 (15 total)
##   varLabels: X Plate_ID ... CellType (13 total)
##   varMetadata: labelDescription
## Annotation
##   array: IlluminaHumanMethylation450k
##   annotation: ilmn12.hg19

Estimate cell proportions

Estimating proportions of 7 cell types found in cord blood (note the nucleated Red Blood Cells)
This next command is commented out because it requires >4GB of RAM
if you don’t have that - you can load the presaved output below

# cellprop <- estimateCellCounts(WB, compositeCellType = "CordBlood",
#   cellTypes = c("CD8T","CD4T", "NK","Bcell","Mono","Gran", "nRBC"))
# write.csv(cellprop, file = "data/cellprop_WB_15samps_bakulski2016.csv", row.names = F)

read in the estimated cell proportions:

cellprop <- read.csv("data/cellprop_WB_15samps_bakulski2016.csv")

drop the reference dataset from memory

rm(FlowSorted.CordBlood.450k)

Here are the estimates

knitr::kable(cellprop, digits = 2)
CD8T CD4T NK Bcell Mono Gran nRBC
0.13 0.22 0.00 0.07 0.08 0.44 0.09
0.11 0.08 0.00 0.08 0.10 0.55 0.11
0.15 0.10 0.05 0.05 0.08 0.53 0.07
0.25 0.14 0.02 0.12 0.07 0.34 0.08
0.18 0.20 0.00 0.06 0.06 0.44 0.09
0.16 0.11 0.00 0.08 0.11 0.54 0.06
0.18 0.19 0.00 0.08 0.04 0.48 0.07
0.14 0.07 0.00 0.07 0.11 0.58 0.05
0.11 0.10 0.00 0.06 0.09 0.61 0.04
0.16 0.13 0.08 0.08 0.04 0.22 0.32
0.18 0.15 0.00 0.13 0.06 0.38 0.19
0.09 0.10 0.00 0.04 0.06 0.42 0.29
0.12 0.04 0.00 0.09 0.10 0.60 0.05
0.20 0.17 0.00 0.05 0.06 0.47 0.06
0.13 0.15 0.00 0.14 0.06 0.38 0.17

note that they are close to summing to 1

summary(rowSums(cellprop))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.002   1.013   1.021   1.026   1.031   1.099

Distribution of estimated cell types

boxplot(cellprop*100, col=1:ncol(cellprop),xlab="Cell type",ylab="Estimated %")

Preprocessing the data

Preprocess the data - this removes technical variation
There are several popular methods including intra- and inter-sample normalizations
Here we demonstrate an effective and lightweight approach:
“Normal out of band background” (Noob) within-sample correction - see Triche et al 2013

system.time(WB.noob <- preprocessNoob(WB))
## Loading required package: IlluminaHumanMethylation450kmanifest
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
## [preprocessNoob] Using sample number 6 as reference level...
##    user  system elapsed 
##  25.850   3.844  29.932

We see the resulting object is now a MethylSet (because the RGset has been preprocessed)
Minfi is incorrectly saying the data are still raw - we verify this is not true below

WB.noob
## MethylSet (storageMode: lockedEnvironment)
## assayData: 485512 features, 15 samples 
##   element names: Meth, Unmeth 
## An object of class 'AnnotatedDataFrame'
##   sampleNames: 1 2 ... 15 (15 total)
##   varLabels: X Plate_ID ... CellType (13 total)
##   varMetadata: labelDescription
## Annotation
##   array: IlluminaHumanMethylation450k
##   annotation: ilmn12.hg19
## Preprocessing
##   Method: Raw (no normalization or bg correction)
##   minfi version: 1.18.6
##   Manifest version: 0.4.0

we can look at a few methylation values on the fly and see that the preprocessing changed them:

# first three CpGs on the first three samples
# raw RGset
print(getBeta(WB)[1:3,1:3], digits = 2)
##               1    2     3
## cg00050873 0.56 0.54 0.838
## cg00212031 0.61 0.80 0.017
## cg00213748 0.32 0.49 0.912
# after preprocessing
print(getBeta(WB.noob)[1:3,1:3], digits = 2)
##               1    2     3
## cg00050873 0.51 0.51 0.871
## cg00212031 0.52 0.54 0.023
## cg00213748 0.47 0.50 0.904

Distribution of beta-values: before and after normalization

densityPlot(WB, main = "density plots before and after preprocessing", pal="blue")
densityPlot(WB.noob, add = F, pal = "red")
# Add legend
legend("topright", c("Noob","Raw"), 
  lty=c(1,1), title="Normalization", 
  bty='n', cex=0.8, col=c("red","blue"))

notice the blue density traces (raw) are more spread out; background correction brings them together
## probe failures due to low intensities We want to drop probes with intensity that is not significantly above background signal (from negative control probes)

detect.p <- detectionP(WB, type = "m+u")

Count of failed probes per sample P>0.01 (i.e. not significant compared to background signal)

knitr::kable(t(as.matrix(colSums(detect.p > 0.01))))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
452 369 108 134 409 226 167 341 372 313 237 396 420 95 119

This is less than 1% of probes per sample
Total number of failed measures (in all probes)

sum(detect.p > 0.01)
## [1] 4158

Restrict data to good probes only:

detect.p[detect.p > 0.01] <- NA
detect.p <- na.omit(detect.p)
intersect <- intersect(rownames(getAnnotation(WB)), rownames(detect.p))
length(intersect)
## [1] 483916

Filter bad probes from our methylset

nrow(WB.noob)
## Features 
##   485512
WB.noob <- WB.noob[rownames(getAnnotation(WB.noob)) %in% intersect,]
nrow(WB.noob)
## Features 
##   483916
# cleanup
rm(intersect, detect.p, WB)

Probe type adjustment

Need to adjust for probe-type bias Infinium I (type I) and Infinium II (type II) probes
RCP with EnMix: Regression on Correlated Probes Niu et al. Bioinformatics 2016

betas.rcp <- rcp(WB.noob)
dim(betas.rcp)
## [1] 483916     15

note that this package takes beta values out of the minfi object - result is a matrix

class(betas.rcp) 
## [1] "matrix"
## Annotation of Infinium type for each probe (I vs II)
typeI <-   minfi::getProbeInfo(WB.noob,type="I")$Name
typeII <-  minfi::getProbeInfo(WB.noob,type="II")$Name
onetwo <- rep(1, nrow(betas.rcp))
onetwo[rownames(betas.rcp) %in% typeII] <- 2
# almost three quarter of our probes are type II
knitr::kable(t(table(onetwo)))
1 2
135078 348838

Density plots by Infinium type: before and after RCP calibration
Probe-type bias adjustment before and after RCP

par(mfrow=c(1,2)) # Side-by-side density distributions 
densityPlot(WB.noob[rownames(getAnnotation(WB.noob)) %in% typeI,],pal = "red",main='Beta density')
densityPlot(WB.noob[rownames(getAnnotation(WB.noob)) %in% typeII,],add = F, pal = "blue")
densityPlot(betas.rcp[rownames(getAnnotation(WB.noob)) %in% typeI,],pal = "red",main='Beta density probe-type adjusted')
densityPlot(betas.rcp[rownames(getAnnotation(WB.noob)) %in% typeII,],add = F, pal = "blue")
legend("topright", c("Infinium I","Infinium II"), 
       lty=c(1,1), title="Infinium type", 
       bty='n',col=c("red","blue"))

notice that the type I and II peaks are more closely aligned after rcp adjustment
(particularly in the higher peak)

rm(onetwo, typeI, typeII)

Batch effects

As an example of an observable batch effect, samples are processed in plates (e.g. bisulfite converting 96 at a time).
This can create batch effects (technical variation) with different intensities by plate.
Other commonly observed batch effects include the position on the chip (e.g. the row effect).
Let’s check if samples were on different plates in these data:

knitr::kable(t(as.matrix(table(pData(WB.noob)$Plate_ID))), col.names = c("Plate 1","Plate2"))
Plate 1 Plate2
6 9

Principal Component Analysis (PCA)

Calculate major sources of variability of DNA methylation using PCA

PCobject = prcomp(t(betas.rcp), retx = T, center = T, scale. = T)

Extract the Principal Components from SVD

PCs <- PCobject$x

Proportion of variance explained by each additional PC

cummvar <- summary(PCobject)$importance["Cumulative Proportion", 1:10]
knitr::kable(t(as.matrix(cummvar)),digits = 2)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
0.4 0.53 0.59 0.65 0.7 0.74 0.78 0.81 0.85 0.88

Is the major source of variability associated with sample plate?

par(mfrow = c(1, 1))
boxplot(PCs[, 1] ~ pData(WB.noob)$Plate_ID,
        xlab = "Sample Plate", ylab = "PC1",
        col = c("red", "blue"))

t.test(PCs[, 1] ~ pData(WB.noob)$Plate_ID)
## 
##  Welch Two Sample t-test
## 
## data:  PCs[, 1] by pData(WB.noob)$Plate_ID
## t = -2.0899, df = 12.409, p-value = 0.05784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -804.81998   15.29786
## sample estimates:
## mean in group 1 mean in group 2 
##       -236.8566        157.9044

Removing batch effects using ComBat from the sva package

# First we convert from beta-values to M-values
Mvals <- log2(betas.rcp)-log2(1-betas.rcp)

ComBat eBayes adjustment using a known variable of interest (here we use plate)

Mvals.ComBat <- ComBat(Mvals, batch = pData(WB.noob)$Plate_ID)
## Found 2 batches
## Adjusting for 0 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# Convert M-values back to beta-values
betas.rcp <- 2^Mvals.ComBat/(1+2^Mvals.ComBat)

PCA after removing batch effects

PCobject <- prcomp(t(betas.rcp), retx = T, center = T, scale. = T)
PCs <- PCobject$x

The first PC is no longer associated with sample plate

boxplot(PCs[,1] ~ pData(WB.noob)$Plate_ID,
        xlab = "Sample Plate", ylab = "PC1",
        col = c("red","blue"))

t.test(PCs[,1] ~ pData(WB.noob)$Plate_ID)
## 
##  Welch Two Sample t-test
## 
## data:  PCs[, 1] by pData(WB.noob)$Plate_ID
## t = -0.91292, df = 12.904, p-value = 0.378
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -645.1705  262.0809
## sample estimates:
## mean in group 1 mean in group 2 
##      -114.92688        76.61792

ComBat removed the apparent batch effect

#cleanup
rm(PCs, Mvals, cummvar, PCobject)

memory usage

as a reminder - these are large datasets and we are working in RAM.
Check memory usage with:

pryr::mem_used()
## 1.13 GB
# this command will check the maximum memory usage on Windows
memory.size(max = T)
## Warning: 'memory.size()' is Windows-specific
## [1] Inf

End of script 1

Analyze methylation data

Using data preprocessed in our script:
meth01_process_data.R

we have a processed dataset with 15 samples (otherwise we run script 01)

if(!exists("WB.noob")){
  source("code/meth01_process_data.R")
}
dim(WB.noob)
## Features  Samples 
##   483916       15

load packages

suppressPackageStartupMessages({
  library(CpGassoc)
  library(data.table)
  library(qqman)
  library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
  library(DMRcate)
  library(MASS) 
  library(sandwich) 
  library(lmtest) 
})

predict sex from methylation

Gbeta <- mapToGenome(WB.noob)  #map to the genome

getSex predicts sex based on X and Y chromosome methylation intensity

getSex(Gbeta) 
## DataFrame with 15 rows and 3 columns
##          xMed      yMed predictedSex
##     <numeric> <numeric>  <character>
## 1    13.81941  9.532875            F
## 2    13.85021  9.676420            F
## 3    13.23687 13.600281            M
## 4    13.29954 13.616983            M
## 5    13.69979  9.649878            F
## ...       ...       ...          ...
## 11   13.23359 13.552924            M
## 12   13.78387 10.039801            F
## 13   13.78433  9.997809            F
## 14   13.33468 13.610287            M
## 15   13.24118 13.561319            M

we see that our predictions match the phenodata

table(pData(WB.noob)$Sex,getSex(Gbeta)$predictedSex)
##    
##     F M
##   F 8 0
##   M 0 7
# cleanup
rm(Gbeta)

consolidate our phenodata

pheno <- as.data.frame(cbind(Sex=pData(WB.noob)$Sex, Plate_ID=pData(WB.noob)$Plate_ID, cellprop))
pheno[,"Sex"] <- ifelse(as.numeric(pheno[,"Sex"])==2,0,1)
# 1 female, 0 male

# cleanup
rm(WB.noob)

quick check of the distribution of gender between plates

table(pheno[,"Sex"], pheno[,"Plate_ID"])
##    
##     1 2
##   0 3 4
##   1 3 5
## Cleaning up the methylation data

Filters a matrix of beta values by distance to SNP. Also removes crosshybridising probes and sex-chromosome probes.

dim(betas.rcp)
## [1] 483916     15
betas.clean <- rmSNPandCH(betas.rcp,  mafcut = 0.05, and = TRUE, rmcrosshyb = TRUE, rmXY= TRUE)
nCpG <- dim(betas.clean)[1]
nCpG
## [1] 427140

Running an Epigenome Wide Association

Here we run an EWAS on sex (as a predictor of methylation)
First we can run a linear regression on a single CpG that we have already picked

j <- 133211
CpG.level <- betas.clean[j,]
CpG.name <- rownames(betas.clean)[j]
CpG.name
## [1] "cg12691488"

difference in methylation between males and females for this CpG some descriptive statistics

knitr::kable(cbind(Min=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],min)),3),
                   Mean=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],mean)),3), 
                   Median=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],median)),3),
                   Max=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],max)),3),
                   SD=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],sd)),3),
                   N=table(pheno[,"Sex"])))
Min Mean Median Max SD N
0 0.265 0.302 0.301 0.357 0.031 7
1 0.034 0.049 0.051 0.065 0.010 8

difference in beta methylation values between different Sex

boxplot(CpG.level ~ pheno[,"Sex"], main="Beta-values")

linear regression on betas

summary(lm(CpG.level~pheno[,"Sex"]))$coefficients[2,c("Estimate", "Pr(>|t|)","Std. Error")]
##      Estimate      Pr(>|t|)    Std. Error 
## -2.526291e-01  1.148608e-11  1.149201e-02

comparison with m-values

CpG.mlevel <- log2(betas.clean[j,])-log2(1-betas.clean[j,])

knitr::kable(cbind(Min=round(simplify2array(tapply(CpG.mlevel, pheno[,"Sex"],min)),3),
                   Mean=round(simplify2array(tapply(CpG.mlevel, pheno[,"Sex"],mean)),3), 
                   Median=round(simplify2array(tapply(CpG.mlevel, pheno[,"Sex"],median)),3),
                   Max=round(simplify2array(tapply(CpG.mlevel, pheno[,"Sex"],max)),3),
                   SD=round(simplify2array(tapply(CpG.mlevel, pheno[,"Sex"],sd)),3),
                   N=table(pheno[,"Sex"])))
Min Mean Median Max SD N
0 -1.470 -1.213 -1.213 -0.847 0.207 7
1 -4.813 -4.294 -4.211 -3.837 0.331 8
boxplot(CpG.mlevel ~ pheno[,"Sex"], main="M-values")

linear regression on m-values

summary(lm(CpG.mlevel~pheno[,"Sex"]))$coefficients[2,c("Estimate", "Pr(>|t|)","Std. Error")]
##      Estimate      Pr(>|t|)    Std. Error 
## -3.080827e+00  1.843188e-11  1.454759e-01

we can always extract measures of the relative quality of statistical models - e.g. adjusted R2 - to look at model performance
model on betas

summary(lm(CpG.level~pheno[,"Sex"]))$adj.r.squared
## [1] 0.9717886

model on mvalues

summary(lm(CpG.mlevel~pheno[,"Sex"]))$adj.r.squared
## [1] 0.9696635

EWAS and results using CpGassoc

See Barfield et al. Bioinformatics 2012
sex as predictor
note that CpGassoc is quite fast for running almost a half million regressions!

system.time(results1 <- cpg.assoc(betas.clean, pheno[,"Sex"]))
##    user  system elapsed 
##  49.389   1.518  55.099

there are several components of the results

class(results1)
## [1] "cpg"
names(results1)
## [1] "results"      "Holm.sig"     "FDR.sig"      "info"        
## [5] "indep"        "covariates"   "chip"         "coefficients"

look at a few results
here effect size is ~ mean difference in methylation proportion

head(cbind(results1$coefficients[,4:5], P.value=results1$results[,3]))
effect.size std.error P.value
cg00000957 -0.0179181 0.0107239 0.1186353
cg00001349 -0.0359551 0.0219256 0.1249933
cg00001583 0.0037335 0.0013244 0.0144934
cg00002028 0.0047158 0.0022191 0.0533213
cg00002719 0.0035198 0.0015482 0.0406126
cg00002837 0.0203328 0.0228082 0.3888793

and the top hits

head(cbind(results1$coefficients[,4:5], P.value=results1$results[,3])[order(results1$results[,3]),])
effect.size std.error P.value
cg03691818 0.0810359 0.0034341 0e+00
cg12691488 -0.2526291 0.0114920 0e+00
cg11643285 0.1533963 0.0084080 0e+00
cg25304146 -0.1149856 0.0091679 0e+00
cg02989351 0.0505712 0.0046351 1e-07
cg25294185 -0.0814720 0.0077311 1e-07

check with previous result on our selected CpG (running lm without CpGassoc)

cbind(results1$coefficients[j,4:5], results1$results[j,c(1,3)])
effect.size std.error CPG.Labels P.value
cg12691488 -0.2526291 0.011492 cg12691488 0
summary(lm(CpG.level~pheno[,"Sex"]))
## 
## Call:
## lm(formula = CpG.level ~ pheno[, "Sex"])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.036900 -0.013928 -0.000714  0.005826  0.055298 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.302080   0.008393   35.99 2.09e-14 ***
## pheno[, "Sex"] -0.252629   0.011492  -21.98 1.15e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0222 on 13 degrees of freedom
## Multiple R-squared:  0.9738, Adjusted R-squared:  0.9718 
## F-statistic: 483.3 on 1 and 13 DF,  p-value: 1.149e-11

Bonferroni significant hits

table(results1$results[,3] < 0.05/(nCpG))
## 
##  FALSE   TRUE 
## 427133      7

FDR significant hits

table(results1$results[,5] < 0.05)
## 
##  FALSE   TRUE 
## 427125     15

EWAS with adjustment for cell types now we can run the linear regression on betas adjusting for cell proportions

results2 <- cpg.assoc(betas.clean,pheno[,"Sex"], 
                      covariates=pheno[,"CD8T"]+ pheno[,"CD4T"] +  
                        pheno[,"NK"]  + pheno[,"Bcell"] + 
                        pheno[,"Mono"] + pheno[,"Gran"] + pheno[,"nRBC"])

look at the results

head(cbind(results2$coefficients[,4:5], P.value=results2$results[,3])[order(results2$results[,3]),])
effect.size std.error P.value
cg03691818 0.0787608 0.0039220 0e+00
cg11643285 0.1634361 0.0083174 0e+00
cg12691488 -0.2596565 0.0132322 0e+00
cg25294185 -0.0921646 0.0069914 0e+00
cg25304146 -0.1184640 0.0108542 1e-07
cg23739457 0.0831171 0.0089081 8e-07

Bonferroni significant hits

table(results2$results[,3] < 0.05/(nCpG))
## 
##  FALSE   TRUE 
## 427136      4

FDR significant hits

table(results2$results[,5] < 0.05)
## 
##  FALSE   TRUE 
## 427135      5

we can also see them with:

results2$FDR.sig
CPG.Labels T.statistic P.value Holm.sig FDR gc.p.value
70962 cg25294185 -13.18252 0e+00 TRUE 0.0018002 1e-07
72477 cg03691818 20.08202 0e+00 TRUE 0.0000248 0e+00
133211 cg12691488 -19.62313 0e+00 TRUE 0.0000248 0e+00
180139 cg11643285 19.64992 0e+00 TRUE 0.0000248 0e+00
397058 cg25304146 -10.91411 1e-07 FALSE 0.0117977 6e-07

results

data(IlluminaHumanMethylation450kanno.ilmn12.hg19)
IlluminaAnnot = data.frame(
  chr=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Locations$chr,
  pos=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Locations$pos,
  Relation_to_Island=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Islands.UCSC$Relation_to_Island,
  UCSC_RefGene_Name=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Other$UCSC_RefGene_Name,
  UCSC_RefGene_Group=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Other$UCSC_RefGene_Group)

## Create CpG name and annotate row names
rownames(IlluminaAnnot) <- rownames(IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Manifest)
IlluminaAnnot$Name <-rownames(IlluminaAnnot)
dim(IlluminaAnnot)
## [1] 485512      6
IlluminaAnnot = IlluminaAnnot [intersect(rownames(IlluminaAnnot), rownames(betas.clean)),]
dim(IlluminaAnnot)
## [1] 427140      6
datamanhat <- data.frame(CpG=results2$results[,1],Chr=as.character(IlluminaAnnot$chr),
                         Mapinfo=IlluminaAnnot$pos, UCSC_RefGene_Name=IlluminaAnnot$UCSC_RefGene_Name, 
                         Pval=results2$results[,3], Eff.Size = results2$coefficients[,4], Std.Error = results2$coefficients[,5])

Reformat the variable Chr (so we can simplify and use a numeric x-axis)

datamanhat$Chr <- as.numeric(sub("chr","",datamanhat$Chr))

see where the top hits are

head(datamanhat[order(datamanhat$Pval), ])
CpG Chr Mapinfo UCSC_RefGene_Name Pval Eff.Size Std.Error
72477 cg03691818 12 53085038 KRT77 0e+00 0.0787608 0.0039220
180139 cg11643285 3 16411667 RFTN1 0e+00 0.1634361 0.0083174
133211 cg12691488 1 243053673 0e+00 -0.2596565 0.0132322
70962 cg25294185 11 65487814 RNASEH2C 0e+00 -0.0921646 0.0069914
397058 cg25304146 18 30092971 WBP11P1 1e-07 -0.1184640 0.0108542
70468 cg23739457 11 117314707 DSCAML1 8e-07 0.0831171 0.0089081

Genomic inflation in EWAS

qqplot and lambda interpretation

par(mfrow=c(1,1))
plot(results1, main="QQ plot for association between methylation and sex")

plot(results2, main="QQ plot for association between methylation and sex \n adjusted for cell proportions")

Lambda - this is a summary measure of genomic inflation
ratio of observed vs expected median p-value - is there early departure of the qqline?
estimated at -log10(0.5) ~ 0.3 on the x-axis of a qqplot

lambda <- function(p) median(qchisq(p, df=1, lower.tail=FALSE), na.rm=TRUE) / qchisq(0.5, df=1)

Lambda for the first EWAS

lambda(results1$results[,3])
## [1] 3.201929

Lambda after cell type adjustment

lambda(results2$results[,3])
## [1] 1.230098

Volcano Plot-results2 with Bonferroni threshold and current FDR

plot(results2$coefficients[,4],-log10(results2$results[,3]), 
     xlab="Estimate", ylab="-log10(Pvalue)", main="Volcano Plot \n adjusted for cell proportions")
#Bonferroni threshold & FDR threshold
abline(h = -log10(0.05/(nCpG)), lty=1, col="red", lwd=2)
abline(h = -log10(max(results2$results[results2$results[,5] < 0.05,3])), lty=1, col="blue", lwd=2)

Manhattan plot for cell-type adjusted EWAS

the function manhattan needs data.frame including CpG, Chr, MapInfo and Pvalues

manhattan(datamanhat,"Chr","Mapinfo", "Pval", "CpG", 
          suggestiveline = -log10(max(results2$results[results2$results[,5] < 0.05,3])), 
          genomewideline = -log10(0.05/(nCpG)), 
          main = "Manhattan Plot \n adjusted for cell proportions")

cleanup

rm(j, nCpG, CpG.name, CpG.level, CpG.rlm, CpG.mlevel, lm.fit.rob.bayes, datamanhat, IlluminaAnnot, IlluminaHumanMethylation450kanno.ilmn12.hg19, lambda)
## Warning in rm(j, nCpG, CpG.name, CpG.level, CpG.rlm, CpG.mlevel,
## lm.fit.rob.bayes, : object 'CpG.rlm' not found
## Warning in rm(j, nCpG, CpG.name, CpG.level, CpG.rlm, CpG.mlevel,
## lm.fit.rob.bayes, : object 'lm.fit.rob.bayes' not found

End of script 02

Regional DNA methylation analysis using DMRcate

Using data preprocessed in our script:
meth01_process_data.R & meth02_process_data.R

we have already set up our analysis

if(!exists("pheno")){
  source("code/meth02_analyze_data.R")
}

Load package for regional analysis “DMRcate” see Peters et al. Bioinformatics 2015.
Other popular options for conducting Regional DNA methylation analysis in R are Aclust and bumphunter

suppressMessages(library(DMRcate)) # Popular package for regional DNA methylation analysis

First we need to define a model

model <- model.matrix(~as.factor(pheno$Sex)+
                        as.numeric(pheno$CD8T)+
                        as.numeric(pheno$NK)+
                        as.numeric(pheno$Bcell)+
                        as.numeric(pheno$Mono)+
                        as.numeric(pheno$Gran)+
                        as.numeric(pheno$nRBC))

Let’s run the regional analysis using the Mvals from our preprocessed data

myannotation <- cpg.annotate("array", Mvals.ComBat, analysis.type="differential",
                             design=model, coef=2)
## Your contrast returned 8161 individually significant probes. We recommend the default setting of pcutoff in dmrcate().

Regions are now agglomerated from groups of significant probes where the distance to the next consecutive probe is less than lambda nucleotides away

dmrcoutput.sex <- suppressMessages(dmrcate(myannotation, lambda=1000, C=2))

Let’s look at the results

head(dmrcoutput.sex$results)
coord no.cpgs minfdr Stouffer maxbetafc meanbetafc
1404 chrX:139583634-139590479 31 0 0 0.5359809 0.3020102
433 chrX:48754200-48756592 27 0 0 0.4763541 0.3128542
667 chrX:66762467-66766506 29 0 0 0.4929935 0.1530296
1616 chrX:153774721-153776358 21 0 0 0.5240904 0.3460454
686 chrX:68046595-68050216 31 0 0 0.5833820 0.2828564
1454 chrX:149529976-149534258 19 0 0 0.4663069 0.3312878

Visualizing the data can help us understand where the region lies relative to promoters, CpGs islands or enhancers Let’s extract the genomic ranges and annotate to the genome

results.ranges <- extractRanges(dmrcoutput.sex, genome = "hg19")

Plot the DMR using the Gviz if you are interested in plotting genomic data the Gviz is extremely useful Let’s look at the first region

results.ranges[1]
## GRanges object with 1 range and 6 metadata columns:
##                            seqnames                 ranges strand |
##                               <Rle>              <IRanges>  <Rle> |
##   chrX:139583634-139590479     chrX [139583634, 139590479]      * |
##                              no.cpgs    minfdr      Stouffer maxbetafc
##                            <integer> <numeric>     <numeric> <numeric>
##   chrX:139583634-139590479        31         0 2.403026e-171 0.5359809
##                            meanbetafc overlapping.promoters
##                             <numeric>           <character>
##   chrX:139583634-139590479  0.3020102              SOX3-001
##   -------
##   seqinfo: 14 sequences from an unspecified genome; no seqlengths
pheno$sex <- ifelse(pheno$Sex==1, "Female", "Male")
groups <- c(Female="magenta", Male="forestgreen")
cols <- groups[as.character(pheno$sex)]
DMR.plot(ranges=results.ranges, dmr=1, CpGs=betas.rcp, phen.col=cols, genome="hg19")

Extracting CpGs-names and locations

chr <- gsub(":.*", "", dmrcoutput.sex$results$coord[1])
start <- gsub("-.*", "", gsub(".*:", "", dmrcoutput.sex$results$coord[1]))
end <- gsub(".*-", "", dmrcoutput.sex$results$coord[1])

CpG ID and individual metrics

cpgs <- dmrcoutput.sex$input[dmrcoutput.sex$input$CHR %in% chr & dmrcoutput.sex$input$pos >= start & dmrcoutput.sex$input$pos <=end,]
knitr::kable(cpgs[1:5,])
ID weights CHR pos betafc indfdr is.sig raw fdr sig step.dmr
482112 cg09031836 19.09928 chrX 139583634 0.4202986 0.0000001 TRUE 0 0 TRUE TRUE
482113 cg04481865 19.60292 chrX 139584049 0.2703162 0.0000001 TRUE 0 0 TRUE FALSE
482114 cg00411843 14.85628 chrX 139584448 0.2762487 0.0000012 TRUE 0 0 TRUE FALSE
482115 cg14933706 14.05387 chrX 139584816 0.1902826 0.0000021 TRUE 0 0 TRUE TRUE
482116 cg21306973 19.52035 chrX 139584886 0.3489712 0.0000001 TRUE 0 0 TRUE TRUE

End of script 03